{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "53e4bcce",
   "metadata": {},
   "source": [
    "MEASURE OF POPULISM FOR CONGRESS \n",
    "\n",
    "including for different dimensions and by topic "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a7daeb12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PACKAGES #############################################################\n",
    "\n",
    "import string\n",
    "import re\n",
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np\n",
    "from gensim.parsing.preprocessing import remove_stopwords\n",
    "from gensim.parsing.preprocessing import stem_text\n",
    "from nltk.corpus import wordnet as wn\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from lexical_diversity import lex_div as ld\n",
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "\n",
    "# DICTIONARY #############################################################\n",
    "# dizionario pop PAUWELS - Already modified\n",
    "\n",
    "\n",
    "# Virtuous people\n",
    "people = ['peopl', 'tradit', 'tradition', 'direct', 'directli', 'referendum']\n",
    "\n",
    "# Corrupt elite\n",
    "elite = ['cast', 'class', 'elit', 'elitist', 'establish',\n",
    "         'polit', 'politic', 'politician', 'corrupt', 'regim',\n",
    "         'regimen', 'rule', 'propaganda', 'directori', 'promin',\n",
    "         'arrog', 'arrogantli', 'betrai', 'treason', 'promis', 'shame',\n",
    "        'undemocrat', 'deceit', 'absurd', 'absurdli', 'admit', 'admitt']\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a72d9489",
   "metadata": {},
   "source": [
    "UPLOAD TEXT "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cea2e9f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('/Users/gloria/Dropbox/Progetti/Rhetoric/dta/legislative')\n",
    "df = pd.read_excel('legislative2018_main.xlsx', sheet_name='txt')\n",
    "df = df[['Name', 'district', 'state'] + [col for col in df.columns if col.startswith('topic_')]]\n",
    "df['district'] = df['district'].str.split().str[:-1].str.join(' ')\n",
    "df['state_name'] = np.where(df['state'].isna(), df['district'], df['state'])\n",
    "\n",
    "del df['district']\n",
    "del df['state']\n",
    "\n",
    "# df = df.dropna(axis=1, how='all')\n",
    "df['election_year'] = 2018"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b5c8157c",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(\"/Users/gloria/Dropbox/Progetti/Rhetoric/Raw data/Congress 2020\")\n",
    "df2 = pd.read_excel('dataset_house.xlsx', sheet_name='txt')\n",
    "df2 = df2[['Name', 'state'] + [col for col in df2.columns if col.startswith('topic_')]]\n",
    "df2.rename(columns={'state': 'state_name'}, inplace=True)\n",
    "# df2 = df2.dropna(axis=1, how='all')\n",
    "df2['election_year'] = 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "04bb0fdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([df, df2])\n",
    "del(df2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32272b6a",
   "metadata": {},
   "source": [
    "INDENTIFY ECONOMICS TOPICS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9d6ec54f",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_topics = [var for var in list(df) if var.startswith('topic_')]\n",
    "econ_topics = ['topic_4'] # Economics\n",
    "econ_ext_topics  = ['topic_4', 'topic_11', 'topic_21']  # Economics + Welfare + Labour groups\n",
    "other_topics = [var for var in all_topics if var not in econ_ext_topics]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9378aea9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['econ'] = df[econ_topics]\n",
    "df['econ_ext'] = df[econ_ext_topics].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)\n",
    "df['other'] = df[other_topics].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)\n",
    "df['all'] = df[all_topics].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)\n",
    "\n",
    "df_stored = df[['Name', 'state_name', 'election_year', 'econ', 'econ_ext', 'other', 'all']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bed3cd33",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = df_stored[['Name', 'state_name', 'election_year', 'econ']]\n",
    "df2 = df_stored[['Name', 'state_name', 'election_year', 'econ_ext']]\n",
    "df3 = df_stored[['Name', 'state_name', 'election_year', 'other']]\n",
    "df4 = df_stored[['Name', 'state_name', 'election_year', 'all']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b57ae979",
   "metadata": {},
   "source": [
    "CLEAN TEXT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e27a7309",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text_data(df):\n",
    "    \n",
    "    df = df.copy() \n",
    "    \n",
    "    df.columns = ['name', 'state_name', 'election_year', 'text']\n",
    "\n",
    "    # Lowercase conversion and newline removal\n",
    "    df['cleaned_text'] = df['text'].str.lower().str.replace('\\n', '', regex=False)\n",
    "\n",
    "    df['len_words'] = df['text'].str.split().str.len().fillna(0)\n",
    "\n",
    "    # Remove punctuation\n",
    "    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if pd.notna(x) else x)\n",
    "\n",
    "    # Remove digits\n",
    "    df['cleaned_text'] = df['cleaned_text'].str.replace('\\d+', '', regex=True)\n",
    "\n",
    "    # Apply stopwords removal and stemming - Ensure these functions can handle NaN values gracefully\n",
    "    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: remove_stopwords(x) if pd.notna(x) else x)\n",
    "    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: stem_text(x) if pd.notna(x) else x)\n",
    "\n",
    "    # Add length of cleaned text\n",
    "    df['len_words_cleaned'] = df['cleaned_text'].str.split().str.len().fillna(0)\n",
    "\n",
    "    return df\n",
    "\n",
    "\n",
    "df1 = clean_text_data(df1)\n",
    "df2 = clean_text_data(df2)\n",
    "df3 = clean_text_data(df3)\n",
    "df4 = clean_text_data(df4) # for the tfidf vectoriser"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "725d8c6f",
   "metadata": {},
   "source": [
    "FIND FREQUENCIES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f47a4d25",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>state_name</th>\n",
       "      <th>election_year</th>\n",
       "      <th>text</th>\n",
       "      <th>cleaned_text</th>\n",
       "      <th>len_words</th>\n",
       "      <th>len_words_cleaned</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Bradley Byrne</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>2018</td>\n",
       "      <td>As your Congressman, I have made great progres...</td>\n",
       "      <td>congressman great progress import local infras...</td>\n",
       "      <td>698</td>\n",
       "      <td>328</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            name state_name  election_year  \\\n",
       "0  Bradley Byrne    Alabama           2018   \n",
       "\n",
       "                                                text  \\\n",
       "0  As your Congressman, I have made great progres...   \n",
       "\n",
       "                                        cleaned_text  len_words  \\\n",
       "0  congressman great progress import local infras...        698   \n",
       "\n",
       "   len_words_cleaned  \n",
       "0                328  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df4.head(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12a05c50",
   "metadata": {},
   "source": [
    "CALCULATE SCORES "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a71c7149",
   "metadata": {},
   "outputs": [],
   "source": [
    "def obtain_scores(df, name):\n",
    "    \n",
    "    df = df.copy()\n",
    "    \n",
    "    df = df.dropna(subset=['cleaned_text'])\n",
    "    \n",
    "    tfidf = TfidfVectorizer(min_df=0.01,\n",
    "                        stop_words='english',\n",
    "                        use_idf=True)\n",
    "\n",
    "    X_tfidf = tfidf.fit_transform(df['cleaned_text'])\n",
    "    feature_names = tfidf.get_feature_names_out()\n",
    "\n",
    "\n",
    "    people_score = [0] * len(df['cleaned_text'])\n",
    "    elite_score = [0] * len(df['cleaned_text'])\n",
    "\n",
    "    for i in range(len(df['cleaned_text'])):\n",
    "        feature_index = X_tfidf[i, :].nonzero()[1]\n",
    "        tfidf_scores = zip(feature_index, [X_tfidf[i, x] for x in feature_index])\n",
    "\n",
    "        for word_index, score in tfidf_scores:\n",
    "            word = feature_names[word_index]\n",
    "\n",
    "            if word in people:\n",
    "                people_score[i] += score  # Directly accumulate scores into the respective list\n",
    "            if word in elite:\n",
    "                elite_score[i] += score\n",
    "\n",
    "    df['elite'] = elite_score\n",
    "    df['people'] = people_score\n",
    "    \n",
    "    df['pop_dim2'] = np.where(df[['people', 'elite']].all(axis=1) == 0, 0,\n",
    "                          df[['people', 'elite']].sum(axis=1))\n",
    "    \n",
    "    df = df[['name', 'state_name', 'election_year', 'len_words' , 'pop_dim2']]\n",
    "    df.columns = ['name', 'state_name', 'election_year', 'len_words_' + name, name]\n",
    "\n",
    "\n",
    "    return df\n",
    "\n",
    "df1_f = obtain_scores(df1, 'pop_econ')\n",
    "df2_f = obtain_scores(df2, 'pop_econ_ext')\n",
    "df3_f = obtain_scores(df3, 'pop_other')\n",
    "df4_f = obtain_scores(df4, 'pop_all')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0592c88",
   "metadata": {},
   "source": [
    "FINAL SCORES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4183e977",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.merge(df4_f, df3_f, on=['name', 'state_name', 'election_year'], how='left')\n",
    "df = pd.merge(df, df2_f, on=['name', 'state_name', 'election_year'], how='left')\n",
    "df = pd.merge(df, df1_f, on=['name', 'state_name', 'election_year'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b6d6c416",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['len_words_pop_econ'] = df['len_words_pop_econ'].fillna(0)\n",
    "\n",
    "df.loc[df.len_words_pop_econ == 0, 'pop_econ'] = np.nan\n",
    "df.loc[df.len_words_pop_econ_ext == 0, 'pop_econ_ext'] = np.nan\n",
    "df.loc[df.len_words_pop_other == 0, 'pop_other'] = np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75cab09f",
   "metadata": {},
   "source": [
    "FINAL DATASET "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "926b480b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = [col.lower() for col in df.columns]\n",
    "df.rename(columns={'len_words_pop_all': 'len_words_all'}, inplace=True)\n",
    "df.rename(columns={'state': 'state_name'}, inplace=True)\n",
    "df['name'] = [a.strip('Â') for a in df['name']]\n",
    "df['name'] = df['name'].str.rstrip().str.lstrip()\n",
    "df.loc[df['name'] == \"McKenzie Levindowfske\", 'name'] = \"McKenzie Levindofske\"\n",
    "df.loc[df['name'] == \"Andrè Carson\", 'name'] = \"AndrÃ¨ Carson\"\n",
    "\n",
    "# Drop empty speeches\n",
    "df = df.loc[df.len_words_all>0]\n",
    "\n",
    "os.chdir('/Users/gloria/Dropbox/Progetti/Rhetoric/text analysis')\n",
    "df.to_csv('tfidf_pop_by_topic.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
